################################################################################
##
## Purpose: This script assembles the final dataset and does some final cleaning.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/prepped/finalData_for_NLP.RData: Prepped data from 6_DATA_intermediate_build.R
##    - ./data/prepped/hearings/toxicity_resultsFull.RData: Prepped data from 8_DATA_toxicity_prep.R (not run)
##    - ./data/prepped/hearings/topic_models_100.RData: Prepped data from 7_DATA_topic_model_prep.R
##    - ./data/prepped/hearings/topic_models_grp_spkr.RData: Prepped data from 7_DATA_topic_model_prep.R
##  - Outputs:
##    - ./data/prepped/finalData.RData
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from a clean workspace so objects left over from earlier scripts cannot
# leak into this build, then trigger garbage collection.
rm(list = ls())
gc()
# library() stops immediately if tidyverse is not installed; require() would
# only warn and return FALSE, letting the script fail later on the first pipe.
library(tidyverse)
set.seed(123)

# Compute details
# Prints the run date plus a best-effort description of the machine so that the
# associated log file records the compute environment.
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
os_name <- Sys.info()[['sysname']]
if (os_name == 'Windows') {
  # The first element of each wmic result is skipped ([-1] / [2]).
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov

  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if (os_name == 'Linux') {
  # CPU / memory share, pid and command line of any running `rsession` process.
  # The first line of the ps output is dropped; the remaining lines are split
  # on runs of whitespace so the parse does not depend on how ps pads columns.
  ps_lines <- system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE)
  ps_fields <- strsplit(trimws(ps_lines[-1]), "\\s+")
  rsession_usage <- do.call(rbind, lapply(ps_fields,
                                          function(x) data.frame(
                                            cpu = as.numeric(x[1]),
                                            mem = as.numeric(x[2]),
                                            pid = as.numeric(x[3]),
                                            cmd = paste(x[-(1:3)], collapse = " "))))
  # Explicit print so the table is logged even when auto-printing is off
  # (e.g. under source()). NULL is printed when ps lists no rsession process.
  print(rsession_usage)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

# R version, platform and attached package versions for the log.
print(sessionInfo())



# -------------------------------------------------------------- Merging Step 2
# Picking back up here after the toxicity and topic models have been prepared.
# The merges below expect this file to provide `finalMerge`.
load('./data/prepped/finalData_for_NLP.RData')

# Loading in the toxicity data (expected to provide `toxic_resFull`).
# The file header lists 8_DATA_toxicity_prep.R as the script that prepares it.
load('./data/prepped/hearings/toxicity_resultsFull.RData')

# Loading in the topic model data (expected to provide `doc_topic_distr`).
# The file header lists 7_DATA_topic_model_prep.R as the script that prepares it.
load('./data/prepped/hearings/topic_models_100.RData')

# Merging with toxicity
# Three combined scores per row, each the sum of the listed toxicity columns
# with NAs ignored (a row whose inputs are all NA gets 0). rowSums() is
# vectorised over rows, avoiding a per-row rowwise() loop.
toxic_resFull <- toxic_resFull %>%
  mutate(combAttack = rowSums(cbind(ATTACK_ON_AUTHOR,IDENTITY_ATTACK,INSULT,THREAT),na.rm = TRUE),
         combToxic = rowSums(cbind(TOXICITY,PROFANITY,SEXUALLY_EXPLICIT,FLIRTATION,INFLAMMATORY,OBSCENE),na.rm = TRUE),
         combIncoh = rowSums(cbind(INCOHERENT,UNSUBSTANTIAL),na.rm = TRUE))

# Prefix every toxicity column (including the join key text_id) with SENT_ so
# they are easy to select with matches('SENT_') later on.
colnames(toxic_resFull) <- paste0('SENT_',colnames(toxic_resFull))

finalMerge <- finalMerge %>%
  left_join(toxic_resFull,by = c('fullInd' = 'SENT_text_id'))


# Merging with topic model loadings
# Reshape to one row per id with one theta column per topic (topic_<value>)
# before joining on the utterance id.
finalMerge <- finalMerge %>%
  left_join(doc_topic_distr %>% spread(topic,theta,sep = '_'),by = c('fullInd' = 'id'))

# Print the merged column names as a quick check.
colnames(finalMerge)

# Preparing measures
# Builds the utterance-level dataset: interruption flags, who is being responded
# to, Fed-chair indicators, the position of the Fed chair's opening statement
# (mind), lagged covariates of the preceding utterance, and per-speaker counts.
utterance_level <- finalMerge %>%
  mutate(date = as.Date(gsub('fed|\\.txt','',docID))) %>% # Strip 'fed' and '.txt' from docID and parse what is left as a date
  arrange(docID,fullInd,ind) %>%
  # NOTE(review): the lags in the next two mutate() calls run on ungrouped data,
  # so the first utterance of a docID looks back at the last utterance of the
  # preceding docID. Confirm whether these should be computed within docID.
  mutate(interruptor = ifelse(grepl('--$',lag(textclean)),1,0),  # An utterance interrupts when the preceding text ends in two hyphens
         interrupted = ifelse(grepl('--$',textclean),1,0)) %>%   # An utterance is interrupted when its own text ends in two hyphens
  mutate(respondingTo = lag(opensecretsID),  # Speaker of the preceding utterance
         interruptedBy = lag(opensecretsID)) %>% # Same value as respondingTo (also the preceding speaker)
  group_by(opensecretsID) %>%
  mutate(any = sum(interrupted) > 0) %>% # TRUE if the speaker is interrupted at least once anywhere in the data
  ungroup() %>%
  mutate(opensecretsID = relevel(factor(opensecretsID),ref = 'FEDBERNANKE')) %>%
  mutate(fed = ifelse(grepl('^FED',opensecretsID),1,0), # Indicator for whether the speaker is a Fed chair
         fedResp = ifelse(grepl('^FED',respondingTo),1,0), # Indicator for whether the person being spoken to is a Fed chair
         yellen = ifelse(grepl('YELLEN',respondingTo),1,0), # Indicator for whether the person being spoken to Yellen
         yellenTime = year %in% 2014:2017) %>% # Indicator for whether the hearing is attended by Yellen
  group_by(fed,docID) %>%
  mutate(mind = ifelse(nchars == max(nchars*fed),ind,NA)) %>% # A shortcut to identify the opening statement by the Fed chair (slice out the longest utterance by the Fed chair...NB THIS MIGHT NOT ALWAYS BE CORRECT)
  ungroup() %>%
  group_by(docID) %>%
  arrange(ind) %>%
  # Spread the opening-statement index to every row of the hearing. fill()
  # replaces the deprecated standard-evaluation variant fill_().
  fill(mind,.direction = 'updown') %>%
  # ungroup() %>%
  # Still grouped by docID here, so these lags stay within a hearing.
  mutate_at(vars(matches('topic_|nchars|SENT_|yellen_vote|constrain_|cent_decent|oversight_'),
                 age,seniority,votepct,gender,party,nominate_dim1),list(lag = ~lag(.))) %>% # Want nchars, topic loadings, and toxicity measures for the preceding utterance, as well as the covariates of whoever interrupted
  ungroup() %>%
  group_by(opensecretsID) %>%
  mutate(all = n()) %>% # Total number of utterances by speaker across all hearings
  ungroup() %>%
  mutate_at(vars(matches('lag')),function(x) ifelse(is.na(x),0,x)) %>% # If the lagged measure is NA, replace with zero.
  mutate(DEM = ifelse(party == 'D',1,0), # Dummies for party of speaker
         GOP = ifelse(party == 'R',1,0),
         Male = ifelse(gender == 'M',1,0)) %>% # Dummy for gender of speaker
  group_by(opensecretsID,docID) %>%
  mutate(tot_utterances = n()) %>% # Number of utterances by speaker by hearing
  ungroup() %>%
  group_by(docID) %>%
  arrange(ind) %>%
  mutate(tot_utterances_lag = lag(tot_utterances)) %>% # Created after the NA-to-zero step above, so NAs are kept here
  ungroup()

# Patch the display name of six speakers, keyed on opensecretsID; every other
# row keeps its existing name. %in% never returns NA, so rows with a missing
# opensecretsID also keep their name (a chain of ifelse(opensecretsID == ...)
# tests would overwrite those names with NA).
name_fixes <- c(EXPERTKOHN = 'DONALD KOHN',
                N00003218 = 'HAROLD E. FORD JR.',
                N00006267 = 'MICHAEL CRAPO',
                N00026627 = 'PATRICK T. MCHENRY',
                N00029070 = 'JAMES A. HIMES',
                N00029277 = 'GARY C. PETERS')

utterance_level <- utterance_level %>%
  mutate(name = ifelse(opensecretsID %in% names(name_fixes),
                       unname(name_fixes[as.character(opensecretsID)]),
                       name))


# Create a speaker-by-hearing version of the dataset
# Note that the grouping below also includes respondingTo (and fedResp), so each
# row is a speaker x addressee x hearing cell and all counts are per cell.
speaker_level <- utterance_level %>%
  filter(ind > mind) %>% # Drop the utterances up to the opening statement by the Fed chair (rows with a missing mind are dropped as well)
  group_by(opensecretsID,party,gender,speaker,respondingTo,fedResp,yellenTime,
           stab,docID,chamber,nDaughters,nKids,firstDaughter,nSons,
           constrain_empower_tot,oversight_indep_tot,yellen_vote,votepct,
           seniority,age,nominate_dim1,fed,year,date,tot_utterances)  %>%
  summarise(denom = n(), # Number of utterances in the cell
            interrupted = sum(interrupted,na.rm = TRUE), # Number of utterances in the cell that are interrupted
            interruptor = sum(interruptor,na.rm = TRUE), # Number of utterances in the cell that interrupt
            interruptedPct = interrupted*100/denom, # Percent of utterances that are interrupted (uses the summed value above)
            interruptorPct = interruptor*100/denom, # Percent of utterances that interrupt (uses the summed value above)
            combAttack = sum(SENT_combAttack,na.rm = TRUE)/denom, # Summed combined attack score divided by the number of utterances
            combIncoh = sum(SENT_combIncoh,na.rm = TRUE)/denom, # Summed combined incoherence score divided by the number of utterances
            combToxic = sum(SENT_combToxic,na.rm = TRUE)/denom) %>% # Summed combined toxicity score divided by the number of utterances
  ungroup() %>%
  distinct() %>%
  mutate(respondingTo = relevel(factor(respondingTo),ref = 'FEDBERNANKE')) # Make Bernanke the reference level


# Adding in the new topics from the revisions
# The join below expects this file to provide `topics_toMerge`.
load('./data/prepped/hearings/topic_models_grp_spkr.RData')

# No `by` is given, so the join keys are whatever column names the two tables
# share. NOTE(review): confirm the shared columns are only the intended keys.
utterance_level <- left_join(utterance_level, topics_toMerge)

# Write both final datasets to a single .RData file.
save(utterance_level, speaker_level,
     file = './data/prepped/finalData.RData')

# EOF